In [ ]:
    
%%HTML
<style>
.container { width:100% }
</style>
    
In [ ]:
    
import nltk
    
In [ ]:
    
# Uncomment to open the NLTK downloader and fetch the required resources (e.g. gutenberg, brown, punkt, stopwords)
# nltk.download()
    
In [ ]:
    
from nltk.corpus import gutenberg
    
Show the titles of the books that are part of the Gutenberg corpus.
In [ ]:
    
gutenberg.fileids()
    
Retrieve the raw text of the book "Paradise Lost" by John Milton.
In [ ]:
    
print(gutenberg.raw('milton-paradise.txt'))
    
Retrieve the book "Paradise Lost" by John Milton as a list of sentences.
In [ ]:
    
print(gutenberg.sents('milton-paradise.txt'))
    
Retrieve the book "Paradise Lost" by John Milton as a list of words.
In [ ]:
    
print(gutenberg.words('milton-paradise.txt'))
    
In [ ]:
    
from nltk.corpus import brown
    
In [ ]:
    
# The corpus is loaded lazily; ensure_loaded() forces the actual reader to load so that help() shows its documentation
brown.ensure_loaded()
help(brown)
    
In [ ]:
    
brown.fileids()
    
In [ ]:
    
brown.categories()
    
In [ ]:
    
brown.sents(categories='editorial')
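    
The Brown corpus also provides part-of-speech tagged views of the same material; a minimal sketch, assuming the corpus has been downloaded:
In [ ]:
    
brown.tagged_words(categories='editorial')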
    
In [ ]:
    
from nltk.tokenize import sent_tokenize
    
In [ ]:
    
generic_text = 'Lorem ipsum dolor sit amet, amet minim temporibus in sit. Vel ne impedit consequat intellegebat.'
    
The function sent_tokenize splits a string into a list of sentences (English by default; other languages can be selected through the language parameter).
In [ ]:
    
sent_tokenize(generic_text)
    
In [ ]:
    
english_text = 'Where is the closest train station? I need to reach London.'
sent_tokenize(english_text)
    
In [ ]:
    
spanish_text = '¿Dónde está la estación más cercana? Inmediatamente me tengo que ir a Barcelona.'
sent_tokenize(spanish_text, language='spanish')
    
In [ ]:
    
from nltk.tokenize import TreebankWordTokenizer
    
In [ ]:
    
# Tokenize a sentence following the Penn Treebank conventions
simple_text = 'This is a simple text.'
tbwt        = TreebankWordTokenizer()
tbwt.tokenize(simple_text)
    
In [ ]:
    
help(TreebankWordTokenizer)
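    
The Treebank tokenizer follows the Penn Treebank conventions, so a contraction such as "isn't" is split into the two tokens "is" and "n't"; a quick check:
In [ ]:
    
tbwt.tokenize("This isn't a simple text.")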
    
In [ ]:
    
from nltk.tokenize import RegexpTokenizer
    
In [ ]:
    
complex_text = "This isn't a simple text."
    
In [ ]:
    
# Keep only alphabetic characters and apostrophes, so that contractions are not split
ret = RegexpTokenizer(r"[a-zA-Z']+")
    
In [ ]:
    
ret.tokenize(complex_text)
    
In [ ]:
    
complex_text = 'This isn\'t a simple text. Count 1, 2, 3 and then go!'
    
In [ ]:
    
ret.tokenize(complex_text)
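    
A different pattern can keep the digits as well; a small sketch with a hypothetical tokenizer (ret_alnum) that accepts alphanumeric tokens:
In [ ]:
    
ret_alnum = RegexpTokenizer(r"[a-zA-Z0-9']+")
ret_alnum.tokenize(complex_text)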
    
In [ ]:
    
from nltk.corpus import stopwords
    
In [ ]:
    
sw = set(stopwords.words('english'))
sw
    
In [ ]:
    
len(sw)
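    
Stop-word lists are available for several other languages too; for example, assuming the stopwords corpus has been downloaded:
In [ ]:
    
stopwords.words('spanish')[:10]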
    
In [ ]:
    
complex_text = 'This isn\'t a simple text. Count 1, 2, 3 and then go!'
tokens = ret.tokenize(complex_text)
# Remove the stop words from the token list
clean_tokens = [t for t in tokens if t not in sw]
clean_tokens
    
In [ ]:
    
from langdetect import detect, detect_langs
    
In [ ]:
    
# Note: the sentence is written in English, even though it mentions German
detect('This is German')
    
In [ ]:
    
detect_langs('I really love you mon doux amour!')
    
In [ ]:
    
detect('I really love you mon doux amour!')
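    
For short or mixed-language strings the predictions of langdetect can change between runs; the library documents setting a seed on DetectorFactory to make them reproducible. A minimal sketch:
In [ ]:
    
from langdetect import DetectorFactory
DetectorFactory.seed = 0
detect_langs('I really love you mon doux amour!')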
    
In [ ]:
    
from nltk.stem.snowball import SnowballStemmer
    
In [ ]:
    
# English Snowball stemmer; with ignore_stopwords=True stop words are left unstemmed
ess = SnowballStemmer('english', ignore_stopwords=True)
    
In [ ]:
    
ess.stem('flies')
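    
SnowballStemmer supports several languages besides English; the available ones are exposed through its languages attribute:
In [ ]:
    
print(SnowballStemmer.languages)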
    
In [ ]:
    
from nltk.stem.snowball  import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
    
In [ ]:
    
ps = PorterStemmer()
ps.stem('teeth')
    
In [ ]:
    
ls = LancasterStemmer()
ls.stem('teeth')
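    
As a quick comparison of the three stemmers defined above (the exact outputs may vary with the NLTK version):
In [ ]:
    
# Apply the Snowball, Porter and Lancaster stemmers to the same words
for word in ('flies', 'teeth', 'running'):
    print(word, '->', ess.stem(word), ps.stem(word), ls.stem(word))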
    
In [ ]:
    
from sklearn.feature_extraction.text import CountVectorizer
    
In [ ]:
    
corpus = [ 'This is a simple test corpus',
           'A corpus is a set of text documents',
           'We want to analyze the corpus and the documents',
           'Documents can be automatically tokenized'
         ]
    
In [ ]:
    
cv = CountVectorizer()
    
In [ ]:
    
# Learn the vocabulary and build the sparse document-term matrix
vectorized_corpus = cv.fit_transform(corpus)
    
In [ ]:
    
vectorized_corpus.todense()
    
In [ ]:
    
cv.vocabulary_
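    
The fitted vectorizer can also transform unseen documents with the learned vocabulary; out-of-vocabulary words are simply ignored. A minimal sketch:
In [ ]:
    
cv.transform(['A simple set of new documents']).todense()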
    